import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.features import Manifold
# Load the Wisconsin breast-cancer dataset and assemble it into a single
# DataFrame with the class label in a 'TARGET' column.
cancer = load_breast_cancer()
print(cancer.DESCR)
breastCancerDF = pd.DataFrame(cancer.data, columns=cancer.feature_names)
breastCancerDF['TARGET'] = cancer.target
breastCancerDF.head()
# Column-by-column exploratory profile (rendered inline in a notebook).
pandas_profiling.ProfileReport(breastCancerDF)
X = breastCancerDF.drop('TARGET', axis=1)
y = breastCancerDF['TARGET']
# t-SNE embedding of the full feature space, coloured by class, to see
# how separable the two classes look before any modelling.
plt.figure(figsize=(15,10))
visualizer = Manifold(manifold='tsne', target='discrete')
visualizer.fit_transform(X, y)
visualizer.poof()
# Hand-picked feature subset. The radius / perimeter / area family and the
# "worst" size measures are strongly correlated, so most are excluded; the
# exclusions are kept as comments to document what was dropped.
# FIX: sklearn's breast-cancer feature names are space-separated
# ('mean texture'), not underscore-separated -- the underscore spellings
# would raise a KeyError when indexing the DataFrame below.
select = [
#'mean radius',
'mean texture',
#'mean perimeter',
'mean area',
'mean smoothness',
'mean compactness',
#'mean concavity',
'mean concave points',
'mean symmetry',
'mean fractal dimension',
#'radius error',
'texture error',
#'perimeter error',
'area error',
'smoothness error',
'compactness error',
'concavity error',
'concave points error',
'symmetry error',
'fractal dimension error',
#'worst radius',
#'worst texture',
#'worst perimeter',
#'worst area',
'worst smoothness',
'worst compactness',
'worst concavity',
#'worst concave points',
'worst symmetry',
'worst fractal dimension'
]
# Restrict the design matrix to the 20 selected columns.
X = breastCancerDF[select]
y = breastCancerDF['TARGET']
# Stratified hold-out split so both classes keep their base rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1337)

# Preprocessing: median-impute the numeric columns. The categorical branch
# is empty for this dataset but kept so the pipeline generalises.
numericCols = X.columns.tolist()
numericPipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])
categoricalCols = []
categoricalPipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numericPipe, numericCols),
    ('categorical', categoricalPipe, categoricalCols),
])

# Baseline model: preprocessing followed by a plain logistic regression.
logistic = LogisticRegression()
logPipe = Pipeline([
    ('preprocess', preprocessor),
    ('logistic', logistic),
])
logPipe.fit(X_train, y_train)
preds = logPipe.predict(X_test)          # hard class labels
proba = logPipe.predict_proba(X_test)    # per-class probabilities
# Train vs. test accuracy: a quick over/under-fitting check.
print(f'Score Train {logPipe.score(X_train, y_train):.3f}')
print(f'Score Test {logPipe.score(X_test, y_test):.3f}')
# Per-row class probabilities alongside the hard prediction.
predDF = pd.DataFrame(proba, columns=['Probability False', 'Probability True'])
predDF['Predicted Class'] = preds
predDF.head()
# Precision / recall / F1 / queue rate across probability thresholds, to
# help choose an operating point other than the default 0.5.
plt.figure(figsize=(15,10))
thresholdViz = DiscriminationThreshold(logPipe)
thresholdViz.fit(X_train, y_train)
thresholdViz.poof()
# Test-set metrics for the baseline pipeline.
acc = logPipe.score(X_test, y_test)  # FIX: was used below but never defined
print('Test Accuracy = {:.3f}'.format(acc))
# FIX: log loss must be scored on predicted probabilities, not hard 0/1
# labels -- with hard labels every mistake is a maximally confident error.
print('Log Loss: {:.3f}'.format(log_loss(y_test, proba[:, 1])))
# FIX: don't shadow sklearn's confusion_matrix function with its result.
cm = confusion_matrix(y_test, preds)
print(cm)
rpt = classification_report(y_test, preds)
print(rpt)
# FIX: ROC AUC ranks by score, so use the positive-class probability
# rather than the thresholded predictions.
logit_roc_auc = roc_auc_score(y_test, proba[:, 1])
fpr, tpr, thresholds = roc_curve(y_test, proba[:, 1])
plt.figure(figsize=(15,10))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.style.use('seaborn-whitegrid')
plt.show()
print('AUC: {:.3f}'.format(logit_roc_auc))
print('Accuracy on Test : {:.3f}'.format(acc))
# FIX: `base` was undefined; use the majority-class baseline, i.e. the
# accuracy of always predicting the more common class.
base = max(y_test.mean(), 1 - y_test.mean())
print('Baseline on Test : {:.3f}'.format(base))
print("Cohen's Kappa : {:.3f}".format(cohen_kappa_score(logPipe.predict(X_test), y_test)))
# Confusion matrix rendered as percentages.
plt.figure(figsize=(8,6))
cmViz = ConfusionMatrix(logPipe, classes=[0,1], percent=True, fontsize=20)
cmViz.fit(X_train, y_train)
cmViz.score(X_test, y_test)
cmViz.poof()

# Precision / recall / F1 heat map with per-class support counts.
plt.figure(figsize=(10,8))
reportViz = ClassificationReport(logPipe, classes=[0,1], support=True)
reportViz.fit(X_train, y_train)
reportViz.score(X_test, y_test)
g = reportViz.poof()

# ROC curves (per class plus micro / macro averages).
plt.figure(figsize=(15,10))
rocViz = ROCAUC(logPipe, classes=['No Cancer', 'Cancer'])
rocViz.fit(X_train, y_train)
rocViz.score(X_test, y_test)
g = rocViz.poof()

# Stacked bar chart of predicted vs. actual class membership.
plt.figure(figsize=(8,6))
errViz = ClassPredictionError(logPipe, classes=[0,1])
errViz.fit(X_train, y_train)
errViz.score(X_test, y_test)
g = errViz.poof()
# Refit the same 20-feature model with statsmodels to get per-coefficient
# statistics (p-values, confidence intervals) via summary2().
# NOTE(review): the design matrix is used as-is, without sm.add_constant,
# so this model has no intercept term -- confirm that is intentional.
model = sm.Logit(y_train, X_train)
result = model.fit()
print(result.summary2())
# Predicted P(class == 1) on the test set; compared against the
# 15-feature and regularized models further below.
preds20 = result.predict(X_test)
# Recursive feature elimination sweep: for each subset size 1..20, rank
# features with RFE, refit a statsmodels Logit on that subset, and record
# AUC / F1 / log loss on the test set.
metricAUC = []
metricF1 = []
metricLogLoss = []
feature_names = np.array(X.columns)  # hoisted; FIX: no longer shadows the loop variable
for n_features in range(1, 21):
    # FIX: RFE's size/step arguments are keyword-only in current sklearn;
    # the old positional form RFE(est, n, 1) raises TypeError.
    rfe = RFE(logistic, n_features_to_select=n_features, step=1)
    rfe = rfe.fit(X_train, y_train)
    topFeatures = feature_names[rfe.support_].tolist()
    model = sm.Logit(y_train, X_train[topFeatures])
    result = model.fit()
    subset_proba = result.predict(X_test[topFeatures])
    fpr, tpr, thresholds = roc_curve(y_test, subset_proba, pos_label=1)
    metricAUC.append(auc(fpr, tpr))
    metricF1.append(f1_score(y_test, subset_proba > 0.5))
    metricLogLoss.append(log_loss(y_test, subset_proba))
# Index each series by the subset size so the x-axis reads as N features.
nFeatureIndex = range(1, 21)
pd.DataFrame(metricAUC, columns=['AUC'], index=nFeatureIndex).plot(figsize=(15,10))
plt.title('AUC for Top N Features')  # FIX: was mislabelled 'F1 Score'
plt.ylabel('AUC')
plt.show()  # FIX: a bare `plt` expression is a no-op in a script
pd.DataFrame(metricF1, columns=['F1 Score'], index=nFeatureIndex).plot(figsize=(15,10))
plt.title('F1 Score for Top N Features')
plt.ylabel('F1 Score')
plt.show()
pd.DataFrame(metricLogLoss, columns=['LogLoss'], index=nFeatureIndex).plot(figsize=(15,10))
plt.title('LogLoss for Top N Features')
plt.ylabel('LogLoss Score')
plt.show()
# Final RFE model: keep the 15 best-ranked features.
# (An unused `logPipeRFE` pipeline from the original draft was removed.)
logistic = LogisticRegression()
# FIX: RFE's size/step arguments are keyword-only in current sklearn.
rfe = RFE(logistic, n_features_to_select=15, step=1)
rfe = rfe.fit(X_train, y_train)
topFeatures = np.array(X.columns)[rfe.support_].tolist()
print(topFeatures)  # FIX: a bare expression prints nothing outside a notebook
# Refit on the reduced design matrix with statsmodels for the summary.
model = sm.Logit(y_train, X_train[topFeatures])
result = model.fit()
print(result.summary2())
# Predicted P(class == 1) for the 15-feature model.
preds15 = result.predict(X_test[topFeatures])
# Compare the 20- and 15-feature statsmodels fits on the held-out test
# set; resultDF accumulates one row per model.
resultDF = pd.DataFrame(columns=['LogLoss', 'AUC'])
for label, probs in [('20 Feature Model', preds20), ('15 Feature Model', preds15)]:
    fpr, tpr, thresholds = roc_curve(y_test, probs, pos_label=1)
    area_under_curve = auc(fpr, tpr)
    logloss = log_loss(y_test, probs)
    resultDF.loc[label] = [logloss, area_under_curve]
    print('AUC: {:.3f}'.format(area_under_curve))
    print('Log Loss: {:.3f}'.format(logloss))
    # Threshold the probabilities at 0.5 for the classification report.
    print(classification_report(y_test, probs > 0.5))
# Grid-search the inverse regularisation strength C for an L1-penalised
# logistic regression.
# FIX: the default lbfgs solver does not support penalty='l1'; liblinear
# does, so without it every grid candidate fails to fit.
logistic = LogisticRegression(solver='liblinear')
regPipe = Pipeline([
('preprocess', preprocessor),
('logistic', logistic)
])
param_grid = {
'logistic__penalty' : ['l1'],
'logistic__C' : np.logspace(-4, 4, 20)
}
clf = GridSearchCV(regPipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
print(best_clf.best_params_)  # FIX: bare expression prints nothing in a script
# L1-regularised statsmodels fit, to see which coefficients shrink to 0.
# NOTE(review): Logit.fit_regularized documents method='l1' with `alpha`
# as the penalty weight; `L1_wt` belongs to OLS/elastic-net and is most
# likely ignored here -- confirm the intended penalty strength (roughly
# 1 / best C from the grid search).
model = sm.Logit(y_train, X_train)
result = model.fit_regularized(method='l1', alpha=1.0, L1_wt=29.763)
print(result.summary2())
predsReg = result.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, predsReg, pos_label=1)
area_under_curve = auc(fpr, tpr)
logloss = log_loss(y_test, predsReg)
resultDF.loc['Regularized Model'] = [logloss, area_under_curve]
print('AUC: {:.3f}'.format(area_under_curve))
print('Log Loss: {:.3f}'.format(logloss))
predsReg_class = predsReg > 0.5
print(classification_report(y_test, predsReg_class))
# Final leaderboard: best (lowest) log loss first.
print(resultDF.sort_values('LogLoss'))